import os
from datetime import datetime
import time
from tqdm import tqdm
import pandas as pd
# import spacy
import re
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sentence_transformers import SentenceTransformer
# from umap import UMAP
from cuml import UMAP
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# from hdbscan import HDBSCAN
from cuml.cluster.hdbscan import HDBSCAN
import plotly.io as pio
pio.renderers.default = "notebook+vscode+jupyterlab"
sns.set_theme(style="darkgrid")
# %config InlineBackend.figure_format = "retina"
Dynamic Topic Modelling of r/politics subreddit
This project allows:
- To clean gathered data and explore it.
- To extract the main topics from gathered data and visualise them.
- To visualise dynamic changes of topics over time.
To extract topics, we use BERTopic library, which performs topic modeling using clustering of vector representations of documents. The main differences between BERTopic and other topic models:
- High speed due to reducing the dimensionality of vector representations.
- Modular structure of the model pipeline: the stages of vectorization, dimensionality reduction and clustering are separated from each other, which allows you to easily and quickly experiment with different combinations of algorithm settings.
- The model pipeline consists of SOTA tools: SBERT, UMAP, HDBSCAN. Combined, this allows you to get the best results compared to other models.
This project can be easily adjusted to other sources of information, which allows you to conduct different experiments.
Install libraries
We use the cuML implementations of HDBSCAN and UMAP to speed up dimensionality reduction and clustering of the data by running them on the GPU.
Load and clean data from csvs
BERTopic uses SBERT. The model learns better if it receives more information from the text. Therefore, data preprocessing is minimal.
Function to clean data from HTML elements using regular expressions
# The patterns are compiled once at import time instead of being looked up on
# every call; the function is applied to every comment of the dataset.
_URL_RE = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_BRACKETED_URL_RE = re.compile(
    r"\(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\)"
)
_SPECIAL_SYMBOLS_RE = re.compile(
    r"\n|\r|\<.*?\>|\{.*?\}|u/|\(.*emote.*\)|\[gif\]|/s|_"
)
# NOTE: inside this class ",-:" is a character range (from "," to ":"), so it
# also keeps "-", ".", "/" and the digits.
_DISALLOWED_CHARS_RE = re.compile(r"[^\w0-9'’“”%!?.,-:*()><]")
_LONE_BRACKET_RE = re.compile(r"\s\(\s")
_WHITESPACE_RE = re.compile(r"\s+")


def regex_preprocessing(text):
    """Clean one comment from URLs, HTML-like markup and stray symbols.

    The substitutions are applied in this order:

    1. URLs (plain, then URLs wrapped in parentheses) become a space.
    2. Line breaks, ``<...>`` and ``{...}`` spans, ``u/``, ``(...emote...)``
       spans, ``[gif]``, ``/s`` and underscores become a space.
    3. Every character outside the allowed set (word characters, quotes and
       a small set of punctuation) becomes a space.
    4. An opening bracket surrounded by whitespace is removed.
    5. Runs of whitespace are collapsed and the result is stripped.

    Args:
        text: The raw comment. Values that are not ``str`` (for example the
            ``NaN`` pandas uses for an empty cell, or ``None``) are returned
            unchanged instead of raising ``TypeError``, so a later
            ``dropna()`` can still remove them.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Remove URL
    text = _URL_RE.sub(" ", text)
    # NOTE(review): this second pass looks unreachable, because the previous
    # substitution already consumed every "http(s)://..." run; kept so the
    # output stays identical to the earlier implementation.
    text = _BRACKETED_URL_RE.sub(" ", text)
    # Remove special symbols
    text = _SPECIAL_SYMBOLS_RE.sub(" ", text)
    text = _DISALLOWED_CHARS_RE.sub(" ", text)
    # Remove unnecessary brackets
    text = _LONE_BRACKET_RE.sub(" ", text)
    # Delete unnecessary whitespaces
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()


# Function to convert data to a dataframe, drop duplicates in the dataframe
# and to apply ‘regex_preprocessing’ function to data
def data_preprocessing(file_name):
    """Load one CSV file, drop duplicated rows and clean its ``comments`` column.

    Args:
        file_name: Path of the CSV file, passed to ``pandas.read_csv``. The
            file must contain a ``comments`` column.

    Returns:
        A new dataframe without duplicated rows whose ``comments`` column has
        been passed through ``regex_preprocessing``.
    """
    data = pd.read_csv(file_name)
    # keep=False removes every member of a duplicated group, not only the
    # repeated occurrences.
    # .copy() detaches the filtered frame from `data`, so the column
    # assignment below writes to an independent object instead of a possible
    # view (which can raise pandas' SettingWithCopyWarning).
    data_cleaned = data.drop_duplicates(keep=False).copy()
    data_cleaned["comments"] = data_cleaned["comments"].apply(regex_preprocessing)
    return data_cleaned


# Function to create a dataframe with a cleaned data
This function consists of several steps:
- Firstly, it gets names of csv files in a chosen folder
- Secondly, it applies ‘data_preprocessing’ function to csv’s to create dataframes with cleaned data
- Lastly, it creates a combined dataframe with cleaned data
def process_data(directory):
    """Build one cleaned dataframe from all CSV files of a folder.

    Steps:
        1. Collect the paths of the ``*.csv`` files in ``directory`` (sorted).
        2. Run ``data_preprocessing`` on each of them.
        3. Concatenate the results, drop the ``time`` column, drop duplicated
           rows and rows with missing values, and renumber the index.

    Args:
        directory: Folder that contains the CSV files.

    Returns:
        The combined, cleaned dataframe with a contiguous 0..n-1 index.

    Raises:
        ValueError: If ``directory`` contains no CSV file.
    """
    # os.listdir also returns sub-directories and unrelated files (for
    # example notebook checkpoint folders), which pd.read_csv cannot parse,
    # so only regular *.csv files are kept.
    candidate_paths = (
        os.path.join(directory, entry) for entry in os.listdir(directory)
    )
    file_names = sorted(
        path
        for path in candidate_paths
        if os.path.isfile(path) and path.lower().endswith(".csv")
    )
    if not file_names:
        raise ValueError(f"No CSV files found in directory: {directory!r}")

    dataframes = [data_preprocessing(name) for name in file_names]
    cleaned_df = (
        pd.concat(dataframes)
        .drop(columns="time")
        .drop_duplicates()
        .dropna()
        # Reset last so the index is contiguous after rows were removed.
        .reset_index(drop=True)
    )
    return cleaned_df


# Apply data processing functions to gathered data
For this experiment, we load the CSV files with data marked as ‘hot’ by Reddit’s algorithms.
# Folder with the gathered CSV files.
directory = "original_data/hot"
combined_df = process_data(directory)
len(combined_df["comments"].to_list())
# Recorded notebook output: 264230

# Convert the dataframe to a list for a further work
comments, timestamps = (
    combined_df[column].to_list() for column in ("comments", "date")
)

# Create embeddings from cleaned data
The gte-small model was chosen using the Hugging Face benchmark. It is lightweight and works well with data from Reddit.
# Pre-calculate embeddings
# Model ids kept for reference:
#   sentence-transformers/gtr-t5-large
#   Alibaba-NLP/gte-base-en-v1.5
#   thenlper/gte-base
#   thenlper/gte-small
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally - confirm it is really needed for this model.
embedding_model = SentenceTransformer(
    "thenlper/gte-small",
    cache_folder="transformers_cache",
    trust_remote_code=True,
)
embeddings = embedding_model.encode(comments, show_progress_bar=True)

# Plot data distribution
We use UMAP to reduce the dimensionality of data, which makes it easier to cluster it using HDBSCAN.
def plot_umap(embeddings, values):
    """Plot 2-D UMAP projections of ``embeddings`` for several ``n_neighbors``.

    One scatter plot is drawn per entry of ``values``. Panels are laid out
    five per row and share both axes; the number of rows follows the number
    of values, so no value is silently dropped (ten values give the 2x5 grid
    used before).

    Args:
        embeddings: Data passed unchanged to ``UMAP.fit_transform``.
        values: Iterable of ``n_neighbors`` settings, one panel each.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    neighbors_list = list(values)
    n_cols = 5
    # Ceiling division; at least one row so the grid is always valid.
    n_rows = max(1, -(-len(neighbors_list) // n_cols))
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(27, 5 * n_rows),
        sharex=True,
        sharey=True,
        squeeze=False,  # always a 2-D array, even for a single row
    )
    axes = axes.flatten()

    for ax, neighbors in tqdm(
        zip(axes, neighbors_list), total=len(neighbors_list)
    ):
        umap_model = UMAP(
            n_neighbors=neighbors, n_components=2, min_dist=0.0, metric="cosine"
        )
        # Apply UMAP to our data
        umap_result = umap_model.fit_transform(embeddings)
        # Visualise the results
        ax.scatter(
            umap_result[:, 0], umap_result[:, 1], alpha=0.15, c="orangered", s=0.1
        )
        ax.set_title(f"UMAP, n_neighbors = {neighbors}")
        ax.set_xlabel("component 1")
        ax.set_ylabel("component 2")

    # Hide the panels of the last row that received no projection.
    for unused_ax in axes[len(neighbors_list):]:
        unused_ax.set_visible(False)

    lim = 7
    # The axes are shared, so fixing the limits on one panel fixes them all.
    axes[0].set_ylim(-lim, lim)
    axes[0].set_xlim(-lim, lim)
    plt.tight_layout()
    plt.show()
def plot_hdbscan(embeddings, umap_values, hdbscan_values):
    """Plot HDBSCAN clusterings of 2-D UMAP projections for a parameter grid.

    For every ``n_neighbors`` in ``umap_values`` the embeddings are projected
    once with UMAP, and one figure is shown with one panel per
    ``min_cluster_size`` in ``hdbscan_values``. The number of panels follows
    the number of sizes (four sizes give the 1x4 grid used before).

    Args:
        embeddings: Data passed unchanged to ``UMAP.fit_transform``.
        umap_values: Iterable of UMAP ``n_neighbors`` settings (one figure
            each).
        hdbscan_values: Iterable of HDBSCAN ``min_cluster_size`` settings
            (one panel each).

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    # Materialise once: the sizes are iterated again for every UMAP setting,
    # which would exhaust a one-shot iterable after the first figure.
    sizes = list(hdbscan_values)
    n_cols = max(1, len(sizes))

    for n in umap_values:
        # Apply UMAP to our data
        umap_model = UMAP(n_neighbors=n, n_components=2, min_dist=0.0, metric="cosine")
        umap_result = umap_model.fit_transform(embeddings)

        # HDBSCAN
        fig, axes = plt.subplots(
            1,
            n_cols,
            figsize=(5 * n_cols, 5),
            sharex=True,
            sharey=True,
            squeeze=False,  # always an array, even for a single panel
        )
        axes = axes.flatten()
        for ax, size in tqdm(zip(axes, sizes), total=len(sizes)):
            # Cluster data with HDBSCAN
            hdbscan_model = HDBSCAN(
                min_cluster_size=size, metric="euclidean", prediction_data=True
            )
            hdbscan_labels = hdbscan_model.fit_predict(umap_result)
            # Create a dataframe with results of UMAP and HDBSCAN
            df = pd.DataFrame(umap_result, columns=["UMAP1", "UMAP2"])
            df["Cluster"] = hdbscan_labels
            # scatterplot for results
            sns.scatterplot(
                x="UMAP1",
                y="UMAP2",
                hue="Cluster",
                data=df,
                palette="tab10",
                legend=None,
                linewidth=0,
                s=0.5,
                ax=ax,
            ).set_title(f"n_neighbors={n}, min_cluster_size={size}")
            ax.set_xlabel("component 1")
            ax.set_ylabel("component 2")

        lim = 7
        # The axes are shared, so fixing the limits on one panel fixes them all.
        axes[0].set_ylim(-lim, lim)
        axes[0].set_xlim(-lim, lim)
        plt.tight_layout()
        plt.show()


# We plot a range of values to see how a structure of data changes: from a
# more local structure to a global one. For topic modelling, it is better to
# focus on a more local view of data for a more precise topic clustering.
# Compare UMAP projections for n_neighbors = 10, 15, ..., 55.
plot_umap(embeddings=embeddings, values=np.arange(10, 56, 5))
# Recorded notebook output: 10it [01:02, 6.29s/it]

We can see sizes of created clusters with different parameter combinations. Blue dots are data marked as a noise. Generally, the higher UMAP and HDBSCAN parameters, the higher the size of clusters.
# One figure per UMAP n_neighbors value, one panel per min_cluster_size.
plot_hdbscan(
    embeddings=embeddings,
    umap_values=[15, 20, 25],
    hdbscan_values=[15, 35, 50, 75],
)
# Recorded notebook output: 4it [01:45, 26.42s/it]

4it [01:52, 28.13s/it]

4it [01:42, 25.57s/it]

Extract topics using BERTopic
Function for Topic Modelling Pipeline
This function encapsulates all previous steps in a one pipeline: creation of embeddings, dimension reduction and clustering of data. There are two new steps added.
We use CountVectorizer from Scikit-learn to:
- remove very rare and frequent words from the final topic representations;
- create n-grams, up to 3 words in total;
- remove stopwords from topic representations;
We use representation models, such as KeyBERTInspired or MaximalMarginalRelevance, to further fine-tune the topic representations.
- MaximalMarginalRelevance model changes the order of words in topics to remove semantic repetitions and create a sequence of the most significant words.
- KeyBERTInspired creates topic representations with words most similar to corresponding documents.
def topic_modelling(n_neighbors, min_cluster_size, representation_name):
    """Fit a BERTopic model and build its tables and visualisations.

    Besides its arguments, the function reads ``embedding_model``,
    ``comments``, ``embeddings`` and ``timestamps`` from the enclosing scope.

    Args:
        n_neighbors: ``n_neighbors`` of the UMAP dimensionality reduction.
        min_cluster_size: ``min_cluster_size`` of the HDBSCAN clustering.
        representation_name: ``"KeyBERTInspired"`` selects the
            ``KeyBERTInspired`` representation model; any other value selects
            ``MaximalMarginalRelevance``.

    Returns:
        A 7-tuple ``(topics, probs, topic_representation, topics_map,
        topics_hierarchy, topics_over_time, plot)``.
    """
    # Dimensionality reduction and clustering stages
    reducer = UMAP(
        n_neighbors=n_neighbors, n_components=5, min_dist=0.0, metric="cosine"
    )
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size, metric="euclidean", prediction_data=True
    )
    # Vectorizer used to remove noise from the created topics
    vectorizer = CountVectorizer(
        stop_words="english", min_df=0.03, max_df=0.99, ngram_range=(1, 3)
    )
    # Anything other than "KeyBERTInspired" falls back to MMR.
    representation_model = (
        KeyBERTInspired()
        if representation_name == "KeyBERTInspired"
        else MaximalMarginalRelevance()
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=reducer,
        hdbscan_model=clusterer,
        vectorizer_model=vectorizer,
        representation_model=representation_model,
        verbose=True,
    )

    # Fit on the pre-computed embeddings
    topics, probs = topic_model.fit_transform(comments, embeddings)

    # Topic table and 2D map of the topics
    topic_representation = topic_model.get_topic_info()
    topics_map = topic_model.visualize_topics()

    # Hierarchical visualisation of the topics
    topics_hierarchy = topic_model.visualize_hierarchy(
        hierarchical_topics=topic_model.hierarchical_topics(comments)
    )

    # Topics over time and the corresponding plot
    topics_over_time = topic_model.topics_over_time(
        comments,
        timestamps,
        datetime_format="%Y_%m_%d",
        global_tuning=False,
        evolution_tuning=False,
    )
    plot = topic_model.visualize_topics_over_time(
        topics_over_time, top_n_topics=15, height=700, width=1200
    )

    results = (
        topics,
        probs,
        topic_representation,
        topics_map,
        topics_hierarchy,
        topics_over_time,
        plot,
    )
    return results


# Experiments
This part is purely experimental and required a lot of time to tune the model's hyperparameters to get the best output. This is one of the main problems of topic modelling: there is no metric that helps us choose the best hyperparameters. Also, the best modelling result may be subjective. That is why we run a series of experiments to obtain several results.
Generally, hyperparameters should be chosen taking into account several goals:
- To preserve the local structure of the data after reducing the dimensionality of the data with UMAP.
- To reduce the amount of noise in clusters and create an adequate number of topics with HDBSCAN.
- To create a list of understandable topics at the output.
After several rounds of experiments, the best values for UMAP n_neighbors parameter are 25 and 30. The best values for HDBSCAN min_cluster_size are 75 and 100. The best topic representation model is KeyBERTInspired.
Because of the subjectivity of final results, below you can see results of model runs with a set of chosen parameters.
You can click on labels in ‘Topics over Time’ plots to hide some topics for an easier analysis. For a better visualisation purposes, ‘Topics over Time’ plots have only top 15 extracted topics presented.
Noise and odd data can be placed in their own clusters. That is totally normal behavior of HDBSCAN. Ignore obscure topics.
UMAP 25, HDBSCAN 75
# Experiment: UMAP n_neighbors=25, HDBSCAN min_cluster_size=75.
(
    topics,
    probs,
    topic_representation,
    topics_map,
    topics_hierarchy,
    topics_over_time,
    plot,
) = topic_modelling(
    n_neighbors=25,
    min_cluster_size=75,
    representation_name="KeyBERTInspired",
)
# Set the default Plotly renderers before the figure is displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:29:39,747 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:29:56,072 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:29:56,075 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:29:46.239227] Transform can only be run with brute force. Using brute force.
2024-12-16 05:30:43,610 - BERTopic - Cluster - Completed ✓
2024-12-16 05:30:43,639 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:31:03,527 - BERTopic - Representation - Completed ✓
100%|██████████| 121/121 [00:21<00:00, 5.65it/s]
16it [01:30, 5.68s/it]
topics_maptopics_hierarchytopic_representation["Name"].to_list()['-1_democrats_trump_election_republicans',
'0_thank_thanks_answer_years',
'1_bernie_bernie sanders_democrats_republicans',
'2_inflation_economics_tariffs_gas prices',
'3_vote_vote vote_voting_vote vote vote',
'4_elon musk_elon_like elon_musk',
'5_voted harris_voting harris_vote harris_harris did',
'6_hillary_candidate_election_campaign',
'7_russia_russians_putin_like russia',
'8_women_think women_women vote_hate women',
'9_hitler_like hitler_nazi_fascist',
'10_gaza_israel_palestinians_palestine',
'11_rfk jr_rfk_kennedy_anti vaxxer',
'12_christianity_christians_church_religion',
'13_going lose_loses_lose_hopefully',
'14_biden_biden biden_saying biden_blame biden',
'15_trump voters_voting trump_trump supporters_trump supporter',
'16_fox news_fox_fake news_msnbc',
'17_kamala going_kamala_vote kamala_kamala campaign',
'18_states blue_blue states_texas_states red',
'19_abortion_ban abortion_abortion ban_abortions',
'20_won popular vote_win popular vote_got elected_popular vote',
'21_military_use military_national guard_army',
'22_trans people_transgender_transphobic_transgender people',
'23_illegal immigrant_illegal immigrants_deportations_deportation',
'24_america_america america_america really_americans',
'25_vance_vance vance_vance going_vance said',
'26_trump rapist_rapist_pedophile rapist_rapist president',
'27_repeal aca_obamacare_affordable care act_health insurance',
'28_joe rogan_rogan_like joe rogan_rogan just',
'29_matt gaetz_gaetz_gaetz just_gaetz ag',
'30_supreme court_court supreme court_justices_court supreme',
'31_voter fraud_election fraud_fraud election_rigged election',
'32_project 2025 said_project 2025 doesn_project 2025 just_project 2025 people',
'33_democracy_democracy democracy_democracy isn_vote democracy',
'34_iowa poll_iowa_wins iowa_iowan',
'35_education_educate_public education_schooling',
'36_millennials_younger generation_generation_gen gen',
'37_gun ownership_guns_buy gun_firearm',
'38_going prison_going jail_prison_sentenced',
'39_epstein_epsteins_epstein trump_jeffrey epstein',
'40_maga_maga people_maga maga_maga doing',
'41_recess appointments_recess_appointments_appointment trump',
'42_eggs_egg_eggs just_eggs going',
'43_conservatives_conservatives like_conservative_conservatives think',
'44_georgia north carolina_nc ga_ga nc_georgia',
'45_leopards face_leopards ate face_leopards eating faces_leopards eat faces',
'46_whites_white_white person_non white',
'47_puerto rico joke_puerto ricans_puerto rican_puerto rico comments',
'48_merrick garland_biden garland_garland_merrick',
'49_article isn_article actually_misleading headline_headline says',
'50_canadians_canada_canadian_moving canada',
'51_twitter_twitter used_deleted twitter_using twitter',
'52_reagan_ronald reagan_reagan trump_like reagan',
'53_cooked_cooked just_cook_let eat',
'54_betting markets_betting market_like betting_betting',
'55_tim walz_walz did_walz actually_walz said',
'56_jill stein_jill stein voters_vote stein_voted stein',
'57_porn_pornographic_pornography_porn site',
'58_latinos_latino_latino people_latinos trump',
'59_war hawks_trump said_military_guns',
'60_dad_parent_mom_parents',
'61_clown_clowns_clown circus_fucking clown',
'62_tucker carlson_tucker_like tucker_carlson said',
'63_pardon_pardoning_going pardon_pardon trump',
'64_garbage truck_drive garbage truck_driving garbage truck_driving garbage',
'65_buy stock_stock_pump dump_money trump',
'66_fbi background checks_fbi background check_fbi background_fbi',
'67_newsom_think newsom_gavin newsom_newsome',
'68_states rights_state rights_rights state_state right',
'69_orange_orange man_orange man bad_orange shit',
'70_red mirage_mirage_red_like red',
'71_fluoride water_fluoride_remove fluoride_water',
'72_tulsi gabbard_gabbard russian_intelligence tulsi_gabbard',
'73_idiocracy_idiocy_intelligence_idiots think',
'74_normal gays_normal gay_gay_normal gay guys',
'75_climate change going_climate change_global warming_climate',
'76_signs trump_trump sign_trump signs_signs harris',
'77_immigration_immigrate_permanent residency_visas',
'78_liz cheney_like liz cheney_campaigning liz cheney_cheneys',
'79_brain worm_brain worms_brainworm_worm',
'80_crosses_cross_jerusalem_crusaders',
'81_economist_economists_economics_better economy',
'82_liberal echo chamber_echo chambers_politics reddit_conservative subreddit',
'83_laws_law_law law_laws really',
'84_celebrity endorsement_celebrity endorsements_endorsements_endorsement',
'85_dementia_alzheimer_dementia don_cognitive decline',
'86_reddit_reddit reddit_reddit does_reddits',
'87_trump_trump supporter_trump supporters_voted trump',
'88_newt gingrich_gingrich_newt_politics',
'89_bezos_like bezos_jeff bezos_amazon',
'90_fuck em_fuck people_fuck fuck_fuck idiots',
'91_jail trump_prison trump_trump criminal_trump prison',
'92_blowing microphone_blow microphone_microphone say_microphone',
'93_wins trump_trump winning_win trump_trump wins',
'94_john bolton_bolton_john_patriot',
'95_oligarchy_oligarchy usa_global oligarchy_oligarchic',
'96_vote blue_voting blue_vote blue voting_blue vote',
'97_pence_mike pence_trump pence_republican',
'98_ai_artificial intelligence_train ai_ai generated',
'99_civil war_civil war like_war civil_start civil war',
'100_stephen miller_stephen miller trump_steven miller_miller',
'101_rid filibuster_filibuster_does senate_senate',
'102_tiktoks_tiktok_youtube tiktok_news tiktok',
'103_fema_hurricanes_hurricane_trump',
'104_magats_magat_magats won_maggots',
'105_woke_woke stuff_wokeism_anti woke',
'106_leopardsatemyface_leopards face_leopard_leopards',
'107_garbage_garbage yeah_say garbage_garbage garbage',
'108_bots_bot_trolls_shills',
'109_combat roles_women combat roles_women combat_combat',
'110_black voters_voters black_black vote_vote black',
'111_jim jordan_jordan_jim_like jim',
'112_mark robinson_robinson_picked_lose',
'113_voting age_voters young_younger voters_young voters',
'114_police_cops_policing_police officers',
'115_declared victory_didn won_surprised hasn_victory said',
'116_echo chamber reddit_reddit echo chamber_echo chamber_echo chambers',
'117_royalty_nobility_aristocracy_titles nobility',
'118_wins_win win_win_winning',
'119_lottery_fraud_committing fraud_lotteries',
'120_vote felon_voting felon_voted felon_convicted felon president',
'121_let burn_burn_burn let_burning']
UMAP 25, HDBSCAN 100
# Experiment: UMAP n_neighbors=25, HDBSCAN min_cluster_size=100.
(
    topics,
    probs,
    topic_representation,
    topics_map,
    topics_hierarchy,
    topics_over_time,
    plot,
) = topic_modelling(
    n_neighbors=25,
    min_cluster_size=100,
    representation_name="KeyBERTInspired",
)
# Set the default Plotly renderers before the figure is displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:34:33,560 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:34:49,993 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:34:49,995 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:34:40.136982] Transform can only be run with brute force. Using brute force.
2024-12-16 05:35:34,871 - BERTopic - Cluster - Completed ✓
2024-12-16 05:35:34,903 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:35:54,443 - BERTopic - Representation - Completed ✓
100%|██████████| 98/98 [00:26<00:00, 3.72it/s]
16it [01:28, 5.51s/it]
topics_maptopics_hierarchytopic_representation["Name"].to_list()['-1_election_voting_voters_biden',
'0_stupid_happen_dumb_lol',
'1_democrats_blame democrats_democrat_democrats need',
'2_misogyny_women_women vote_women voted',
'3_tariffs_economy_tariff_economics',
'4_russia_like russia_russians_putin',
'5_orange vest_orange_man_face',
'6_musk_elon musk_musk trump_trump musk',
'7_voted harris_harris did_voting harris_vote harris',
'8_vote_vote vote_vote vote vote_just vote',
'9_trump voters_voting trump_trump supporters_people voted trump',
'10_hillary_woman_lady_did',
'11_nazi_nazis_fascist_fascists',
'12_gaza_gazans_palestinians_palestine',
'13_christians_christianity_christian_religious',
'14_rfk jr_rfk_kennedy_jr',
'15_saying biden_trump biden_biden_biden biden',
'16_kamala_kamala lost_kamala going_vote kamala',
'17_media outlets_fox news_media_mainstream media',
'18_illegal immigrant_illegal immigrants_illegal immigration_illegals',
'19_trans women_transgender people_trans people_transgender',
'20_america_america really_america america_fuck america',
'21_got elected_voted_didn vote_won popular vote',
'22_vance_vance doesn_vance vance_jd vance',
'23_rapist president_trump rapist_rapist_voted rapist',
'24_obamacare_repeal aca_health insurance_affordable care act',
'25_national guard_national guards_military_guard',
'26_joe rogan_rogan_rogan just_joe rogan podcast',
'27_maga essentially_maga_maga like_maga maga',
'28_supreme court_supreme courts_justices_supreme court justices',
'29_matt gaetz_gaetz confirmed_like gaetz_gaetz',
'30_states blue states_states red states_red states_blue states',
'31_project 2025 isn_project 2025 said_project 2025_project 2025 just',
'32_going prison_going jail_incarcerated_jail',
'33_democracy_democracy dead_vote democracy_democracy democracy',
'34_education_schooling_public education_schools',
'35_gen xers_genz_gen gen_gen doesn',
'36_leopards_leopards ate face_faces eaten leopards_face eating leopards',
'37_canadians_canada_canadian_moving canada',
'38_iowa_wins iowa_won iowa_iowa poll',
'39_guns_buy gun_firearms_firearm',
'40_epstein_epstein trump_epsteins_trump epstein',
'41_voter fraud_election fraud_fraud election_rigged election',
'42_eggs_eggs yeah_yeah eggs_eggs just',
'43_puerto rico joke_called puerto rico_puerto ricans_puerto rican',
'44_headline_accurate headline_misleading headline_read headline',
'45_white person_white_white man_whites',
'46_recess appointments_make recess appointments_allow recess appointments_recess appointment',
'47_conservatives_conservatives like_liberals conservatives_conservative',
'48_texas_texas texas_tx_texas going',
'49_merrick garland_merrick garland ag_fuck garland_garland did',
'50_georgia_ga_state georgia_ga nc',
'51_betting markets_betting market_people betting_betting',
'52_reagan_reagan did_voted reagan_ronald reagan',
'53_reddit echo chamber_echo chamber reddit_echo chamber_echo chambers',
'54_latino_hispanic latino_latinos_latino people',
'55_twitter_twitter twitter_deleted twitter_twitter used',
'56_cooked_cooked just_cook_meals',
'57_tim walz_walz actually_walz_like walz',
'58_jill stein_jill stein just_voted jill stein_voting jill stein',
'59_fbi_fbi background_fbi background checks_trump fbi',
'60_tucker carlson_tucker really_tucker_like tucker',
'61_war hawks_trump said_guns trained_military',
'62_mom_mom dad_parent_mother',
'63_veterans trump_benefits veterans_veterans voted_veterans benefits',
'64_pardon trump_trump pardon_pardoned trump_president pardon',
'65_newsom_like newsom_think newsom_2028 newsom',
'66_porn_pornographic_pornography_porn just',
'67_garbage truck_truck garbage_trash truck_drive garbage truck',
'68_law yeah_law isn_law_law illegal',
'69_pump dump_pumping stock_dumping money_dump',
'70_states rights_state rights_states rights states_rights states',
'71_fluoride water_water fluoridation_fluoride drinking water_fluoride',
'72_crosses_cross_jerusalem_crusaders',
'73_idiocracy_living idiocracy_idiocracy america_movie idiocracy',
'74_prison trump_trump prison_trump jail_sentence trump',
'75_parents trump_mom voted_voted trump_maga family',
'76_gay_gay gay_people gay_gay people',
'77_brain worm_worm brain_brain worms_worm ate brain',
'78_liz cheney_like liz cheney_campaigning liz cheney_dick liz cheney',
'79_celebrity endorsement_celebrity endorsements_endorsements like_endorsement like',
'80_fema workers_fema just_fema_maga',
'81_climate change_climate change going_like climate change_climate change real',
'82_newt gingrich_gingrich_newt_paul ryan',
'83_dictator_dictator day_said dictator day_dictator life',
'84_blue blue wave_blue wave_wave blue_red wave blue',
'85_signs trump_signs trump signs_trump sign_trump signs',
'86_john bolton_john bolton dick_bolton dick cheney_bolton suggests',
'87_economist_economists_economically liberal_liberal',
'88_pence_trump pence_mike pence_pence right',
'89_getting rid filibuster_rid filibuster_eliminating filibuster_filibuster',
'90_blew mic_blowing mic_mic_blow mic',
'91_garbage_yeah garbage_trash garbage_let garbage',
'92_oligarchy_oligarchy just_american oligarchy_america oligarchy',
'93_magats really_magats_magats going_thing magats',
'94_clown_clowns_clowned_clowning',
'95_vote blue_voting blue_just vote blue_blue vote',
'96_illegal lottery_lottery illegal_running illegal lottery_lottery',
'97_businessman_richest man_richest_billionaire',
'98_tiktok_tiktoks_youtube tiktok_news tiktok']
UMAP 30, HDBSCAN 75
# Experiment: UMAP n_neighbors=30, HDBSCAN min_cluster_size=75.
(
    topics,
    probs,
    topic_representation,
    topics_map,
    topics_hierarchy,
    topics_over_time,
    plot,
) = topic_modelling(
    n_neighbors=30,
    min_cluster_size=75,
    representation_name="KeyBERTInspired",
)
# Set the default Plotly renderers before the figure is displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:03:21,353 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:03:38,813 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:03:38,815 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:03:27.917795] Transform can only be run with brute force. Using brute force.
2024-12-16 05:04:27,475 - BERTopic - Cluster - Completed ✓
2024-12-16 05:04:27,504 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:04:49,547 - BERTopic - Representation - Completed ✓
100%|██████████| 119/119 [00:22<00:00, 5.26it/s]
16it [01:29, 5.59s/it]
topics_maptopics_hierarchytopic_representation["Name"].to_list()['-1_democrats_candidate_voters_trump',
'0_lol_haha_good_thank',
'1_tariffs_inflation_tariff_gas prices',
'2_bernie_democrats_democrat_bernie sanders',
'3_musk_elon musk_like musk_trump musk',
'4_trump_trump trump_trump going_trump just',
'5_vote harris_voting harris_voted harris_harris campaign',
'6_hillary_shes_woman_trump',
'7_putin_russia_trump putin_putins',
'8_gaza_palestinians_palestine_hamas',
'9_vote_vote vote_voting_vote vote vote',
'10_christians_christian_christianity_christian values',
'11_rfk_rfk jr_kennedy_jfk',
'12_joe biden_biden_biden biden_biden called',
'13_vote kamala_voting kamala_voted kamala_kamala campaign',
'14_fox news_fox_mainstream media_msnbc',
'15_going lose_lose_loses_losing',
'16_illegal immigrants_illegal immigration_illegals_deportations',
'17_won popular vote_did vote_win popular vote_didn vote',
'18_abortion_ban abortion_abortion bans_abortion ban',
'19_trans people_transgender people_trans woman_trans rights',
'20_america_america america_americans_american',
'21_rapist_voted rapist_trump rapist_pedophile rapist',
'22_vance_vance vance_like vance_vance going',
'23_obamacare_affordable care act_repeal aca_health insurance',
'24_joe rogan_like joe rogan_rogan_rogan just',
'25_nazi_nazis_hitler_trump hitler',
'26_supreme court_justices_supreme court trump_supreme court justices',
'27_fascist_fascism_fascists_fascist government',
'28_matt gaetz_gaetz_gaetz just_gaetz ag',
'29_women voted trump_women voted_women vote_women voting',
'30_project 2025 going_project 2025 just_project 2025_read project 2025',
'31_military_use military_military leadership_people military',
'32_genz_gen voters_gen_millennials',
'33_maga_maga maga_maga just_maga like',
'34_leopards eating faces_leopards ate face_leopards_leopard eating face',
'35_democracy_vote democracy_democracy democracy_democracy dead',
'36_women_women hate_hates women_women just',
'37_voter fraud_election fraud_fraud election_rigged election',
'38_education_department education_dept education_schooling',
'39_iowa poll_iowa_selzer poll_polls',
'40_mail voting_voting mail_early voting_vote early',
'41_epstein_epsteins_like epstein_release epstein',
'42_guns_firearms_gun ownership_gun',
'43_states red states_red states_blue states_states red',
'44_going prison_goes prison_going jail_prison',
'45_eggs_egg_eggs just_eggs won',
'46_canada_canadian_moving canada_usa',
'47_puerto rico comments_puerto ricans_puerto rican voters_puerto rican',
'48_white people_white middle class_white males_white man',
'49_twitter_use twitter_twitter facebook_twitter account',
'50_article isn_stupid article_article just_headline says',
'51_conservatives_conservative_conservatives just_liberals conservatives',
'52_reagan_reagan trump_ronald reagan_like reagan',
'53_merrick garland_biden garland_garland_merrick',
'54_betting markets_betting_bets_betting sites',
'55_latinos voted_latino vote_latino voters_latinos',
'56_texas_texas texas_tx_lose texas',
'57_jill stein_vote stein_voted stein_stein voters',
'58_tim walz_walz actually_walz didn_walz',
'59_echo chamber_echo chambers_echo chamber reddit_reddit echo chamber',
'60_ga nc_georgia_nc_ga',
'61_war hawks_guns trained_shooting_guns',
'62_liz cheney_liz cheney like_like liz cheney_dick liz cheney',
'63_orange vest_orange face_orange man_orange',
'64_tucker carlson_like tucker_tucker_carlson',
'65_kids_children_children don_like kids',
'66_porn_pornographic_pornography_porn site',
'67_fbi background checks_fbi background check_fbi background_fbi',
'68_peppers_sauce_cooking_spicy',
'69_law isn_law yeah_laws_law doesn',
'70_mitch mcconnell_mcconnell_republican senator_senate does',
'71_states rights_state rights_rights state_state right',
'72_pardon trump_pardoning_pardon_going pardon',
'73_recess appointments_recess appointments senate_recess_appointments trump',
'74_newsom_gavin newsom_newsome_probably',
'75_parents trump_voted trump_maga family_trump',
'76_crosses_cross_jerusalem_crusade',
'77_fluoride water_fluoride drinking water_fluoride_fluoride drinking',
'78_veterans trump_vets voted_veterans_veteran',
'79_tulsi gabbard_gabbard russian_tulsi_gabbard said',
'80_normal gay_gay_normal gay guy_normal gays',
'81_brain worm_brain worms_brainworm_brainworms',
'82_celebrity endorsements_endorsements_endorsement_endorsed',
'83_idiocracy_idiots_stupid_idiot truly',
'84_garbage truck_drive garbage truck_trash truck_driving garbage truck',
'85_climate change_climate change going_fight climate change_fight climate',
'86_newt gingrich_gingrich_newt_paul ryan',
'87_blue wave_red wave_blue tsunami_wave',
'88_clown_clown actually_fucking clown_clowns',
'89_economist_economists_economics_better economy',
'90_john bolton_bolton_patriot_worse trump',
'91_garbage_garbage oh_trash garbage_say garbage',
'92_filibuster_filibustered_rid filibuster_filibuster gone',
'93_dictator_dictator day_dictator life_dictators',
'94_free fair election_free fair elections_fair election_free election',
'95_pence_mike pence_trump pence_hang mike pence',
'96_oligarchy_america oligarchy_global oligarchy_oligarch',
'97_vote blue_voting blue_blue vote_blue voting',
'98_lottery illegal_illegal lottery_lottery_lotteries',
'99_mark robinson_robinson_mark_win',
'100_trump sign_signs trump_harris signs_trump signs',
'101_woke_woke stuff_anti woke_wokeism',
'102_fraud_fraud fraud_fraud say_fraud right',
'103_stephen miller_stephen miller trump_steven miller_miller',
'104_tiktoks_tiktok_tik tok_tik',
'105_jim jordan_jordan_jim_gym',
'106_microphone blow job_giving microphone_microphone blow_microphone',
'107_won likely_won_won surprised_win isn',
'108_ai_ai stuff_ai just_thing ai',
'109_civil war_start civil war_american civil war_confederate',
'110_let burn_burn_burning_burned',
'111_bots_bot_bots trolls_trolls',
'112_cops_police_police officers_policing',
'113_voting age_young people vote_young voters_younger voters',
'114_reddit_reddit does_reddit reddit_reddit just',
'115_got dementia_dementia_dementia riddled_alzheimer',
'116_magats_magats going_magat_magats don',
'117_second wife_cheating wife_wife_cheated wife',
'118_prison trump_jail trump_trump prison_trump jail',
'119_owning libs_owned libs_libs_just libs']
UMAP 30, HDBSCAN 100
# Experiment: UMAP n_neighbors=30, HDBSCAN min_cluster_size=100.
# The call previously passed min_cluster_size=50, which contradicted this
# section's heading ("UMAP 30, HDBSCAN 100") and the values named as best in
# the text (75 and 100).
# NOTE(review): the output recorded below this cell was presumably produced
# by the old call with 50 - re-run the cell to refresh it.
(
    topics,
    probs,
    topic_representation,
    topics_map,
    topics_hierarchy,
    topics_over_time,
    plot,
) = topic_modelling(30, 100, "KeyBERTInspired")
# Set the default Plotly renderers before the figure is displayed.
pio.renderers.default = "notebook+vscode+jupyterlab"
plot2024-12-16 05:17:50,509 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-16 05:18:07,920 - BERTopic - Dimensionality - Completed ✓
2024-12-16 05:18:07,923 - BERTopic - Cluster - Start clustering the reduced embeddings
[I] [05:17:58.095146] Transform can only be run with brute force. Using brute force.
2024-12-16 05:18:51,560 - BERTopic - Cluster - Completed ✓
2024-12-16 05:18:51,592 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-16 05:19:14,737 - BERTopic - Representation - Completed ✓
100%|██████████| 182/182 [00:27<00:00, 6.71it/s]
16it [01:39, 6.19s/it]
topics_map
topics_hierarchy
topic_representation["Name"].to_list()
['-1_democrats_trump_maga_republicans',
'0_bernie_bernie sanders_democrats_blame democrats',
'1_harris trump_trump harris_voted harris_vote harris',
'2_hillary_candidate_election_votes',
'3_gaza_support israel_israel palestine_palestinians',
'4_russia_russians_putin_like russia',
'5_biden_blame biden_biden fault_biden biden',
'6_christians_christian_christianity_religious',
'7_rfk jr_rfk_jr_kennedy',
'8_women_women women_ladies_women just',
'9_inflation_inflationary_raise prices_fed',
'10_national guard_military_use military_military coup',
'11_vote vote_vote_vote vote vote_voting',
'12_tells_tells like_says_said',
'13_kamala going_kamala_vote kamala_kamala campaign',
'14_fox news_fox_mainstream media_media outlets',
'15_trump trump_trump_trump president_trump isn',
'16_won popular vote_winning popular vote_vote won_win popular vote',
'17_ban abortion_abortion bans_abortion_abortion ban',
'18_musk_elon musk_musk trump_trump musk',
'19_trans people_transgender people_trans rights_transgender',
'20_illegal immigrants_illegal immigrant_illegal immigration_illegals',
'21_vance_vance said_trump vance_jd vance',
'22_texas_tx_states blue_texan',
'23_rapist_voted rapist_child rapist_rapists',
'24_america_america america_america fuck_america really',
'25_thank_thanks_thank sharing_welcome',
'26_hitler_trump hitler_nazi_nazis',
'27_fascist_fascism_fascists_fascism trump',
'28_supreme court_supreme court justices_justices_supreme court ruling',
'29_joe rogan_rogan_like rogan_joe',
'30_matt gaetz_gaetz_matt_gabbard',
'31_elon_like elon_elon said_elon just',
'32_voter fraud_election fraud_rigged election_election rigged',
'33_project 2025 going_project 2025_project 2025 just_read project 2025',
'34_maga idiots_maga morons_maga garbage_fuck maga',
'35_genz_gen_millennials_generation',
'36_democracy isn_democracy_isn democracy_vote democracy',
'37_leopards ate face_leopards_leopardsatemyface_leopards ate',
'38_education_public education_schooling_educate',
'39_obamacare_affordable care act_health insurance_healthcare plan',
'40_iowa poll_iowa_polls_polling',
'41_yes did_did yes_sure did_actually did',
'42_guns_firearms_armed_gun',
'43_epstein_jeffrey epstein_michael_hollywood',
'44_eggs_egg_price eggs_chickens',
'45_going prison_incarcerated_prison_prison sentence',
'46_people stupid_stupidity people_stupid people_stupidity',
'47_puerto rico_puerto rican_puerto ricans_puerto',
'48_headline_headline like_like headline_read headline',
'49_early voting_mail voting_voting early_vote early',
'50_sad_really sad_just sad_fucking sad',
'51_twitter_twitter account_like twitter_tweet',
'52_white people_people white_white person_whites',
'53_conservatives_conservative_conservatives liberals_liberals conservatives',
'54_merrick garland_garland_merrick_partisan',
'55_echo chamber_echo chambers_reddit echo chamber_reddit echo',
'56_nc ga_ga nc_georgia_nc',
'57_reagan_reagan trump_ronald reagan_reagan won',
'58_canada_canadian_usa_america',
'59_betting_bets_place bets_betting trump',
'60_orange_orange man_orange fuck_orange face',
'61_tim walz_walz_tim_didn',
'62_jill stein_candidate_party candidate_candidates',
'63_billionaires_billionaires don_billionaire_billionaire class',
'64_latino voters_latinos_latino_hispanics',
'65_war hawks_guns_shooting_troops',
'66_cooked_cook_food_meat',
'67_2016_happened 2016_2016 2020_2020 2016',
'68_mom_mother_mum_parent',
'69_shocked_shocked shocked_shocking_shocked just',
'70_states rights_state rights_rights state_state right',
'71_liz cheney_cheney_cheneys_dick cheney',
'72_tucker carlson_tucker_like tucker_carlson',
'73_fbi_background checks_background check_security clearance',
'74_joking_joke_just joke_joke did',
'75_porn_pornography_ban porn_porn star',
'76_newsom_gavin newsom_newsome_whitmer',
'77_law isn_laws people_law yeah_laws',
'78_stock_stock market_sells_shareholders',
'79_pardon trump_pardon_pardoning_pardons',
'80_voting trump_voted trump_vote trump_voted trump 2016',
'81_taxes_tax_pay taxes_paying taxes',
'82_crosses_cross_crusade_crusades',
'83_garbage truck_truck_trucks_trash',
'84_gay_gay guy_gay man_gay men',
'85_tulsi gabbard_tulsi_gabbard_russian asset',
'86_idiocracy_idiocy_idiots_stupid',
'87_blue wave_wave_waves_blue',
'88_years years_years lol_years left_years',
'89_fluoride_water_drink_milk',
'90_brain worm_brain worms_worm_worms',
'91_recess_appointments_appointment_appoint',
'92_celebrity endorsements_endorsements_endorsement_celebrities',
'93_future elections_election_election election_fair election',
'94_climate change_global warming_climate_climate action',
'95_clown_clowns_fucking clown_clown car',
'96_economist_economists_economics_endorsed',
'97_newt_moral_paul ryan_ryan',
'98_affordable housing_housing crisis_housing_housing market',
'99_let burn_burn_burning_burned',
'100_fema_hurricane_hurricanes_disasters',
'101_garbage_garbage garbage_called garbage_calling garbage',
'102_blame_continue blame_say blame_don blame',
'103_john bolton_bolton_trump bad_patriot',
'104_pence_mike pence_republican_republicans',
'105_reddit_reddit reddit_reddit just_reddit like',
'106_tiktok_tik tok_tik_tweets',
'107_drain swamp_swamp_drained_clearly trump',
'108_woke_anti woke_wokeness_wake',
'109_lottery_fraud_fraudulently_fraudulent',
'110_vote blue_voting blue_blue vote_voted blue',
'111_mic_mic stand_microphones_microphone',
'112_cops_police_policing_cop',
'113_declaring victory_declare victory_win isn_won isn',
'114_oligarchy_oligarch_oligarchs_billionaire oligarchs',
'115_stephen miller_miller_stephen_trump soon',
'116_dementia_alzheimer_dementia don_demented',
'117_civil war_american civil_confederate_confederacy',
'118_signs trump_trump sign_trump signs_trump ones',
'119_businessman_richest man_doing business_business',
'120_filibuster_senate_house senate_senators',
'121_owning libs_owned libs_libs_libs just',
'122_fraud_fraudulent_fraudster_fraud claims',
'123_election day_vote day_voting day_day election',
'124_mark robinson_robinson_couldn vote_picked trump',
'125_trump won_wins trump_trump wins_trump win',
'126_jim jordan_jordan_jim_gym',
'127_twice impeached_impeached_impeachment_impeach',
'128_ai_artificial_bot_intelligent',
'129_fingers crossed_hoping_looking forward_soon',
'130_magats_magat_fuckin_frat',
'131_convicted felon_felon_felons_felonies',
'132_bots_bot_trolls_ai',
'133_taliban_afghanistan_american_fought',
'134_leak_leaking_leaked_leaks',
'135_crime trump_trump prison_trump_trump committed',
'136_young voters_younger voters_voting age_youth vote',
'137_late_better late_far late_little late',
'138_wife_wives_married_ex wife',
'139_percent_percentages_15_rate',
'140_bezos_jeff bezos_amazon_billionaires',
'141_pelosi_nancy pelosi_feel democrats_bernie',
'142_concepts plan_concept plan_plan plan_concepts',
'143_missile defense_missile_missiles_walker',
'144_collins_susan_shocked_worst person',
'145_newsweek_newsweek article_journalism_bothered read',
'146_lindsey graham_graham_lindsey_lindsay',
'147_social security_medicare social security_social security medicare_medicare social',
'148_palpatine_darth_vader_star wars',
'149_monarchy_aristocracy_orders_order',
'150_fuck em_fuck people_fuck fuck_fuck',
'151_frivolous lawsuits_lawsuits_lawsuit_suing',
'152_working class_working class american_help working class_class working class',
'153_onion_actual news_satire_shit says',
'154_mitch mcconnell_mcconnell_mitch_senator',
'155_alex jones_infowars_jones_alex',
'156_electoral college_electoral_electors_elections',
'157_china_taiwan_chinese_world',
'158_isn going win_win pa_wins_kamala win',
'159_recount_recounts_ballots_votes',
'160_leon_involved_paid_corpse',
'161_podcasters_podcaster_podcast_podcasts',
'162_justice_injustices_injustice_court justice',
'163_john oliver_oliver_john_says',
'164_cheating_cheated_cheat_cheats',
'165_couch_couches_furniture_like jd',
'166_video clip_clip_clips_watched clip',
'167_rules thee_rule_rules just_make rules',
'168_domestic terrorists_domestic terrorism_domestic terrorist_terrorists',
'169_great idea_idea__',
'170_drunk_drinking_sober_alcohol',
'171_bought twitter_twitter got_twitter_twitter don',
'172_golf_golf course_play golf_golfing',
'173_immigration_passport_visas_visa',
'174_elmo_furry_twitter_created',
'175_citizens united_citizen united_citizens_citizen',
'176_economy trump_economy bad_republican voters_democrats better',
'177_political party_parties_party reason_major parties',
'178_camera_cameras_man_bad man',
'179_dc_democrat_voted democrat_maryland',
'180_worked trump_despises trump_trump fully_terms trump',
'181_brexit_britain_uk_eu',
'182_copium_hopium_smoking_quite bit']